HFD Intestine Proteomics analysis - 01 - Filtering and Imputation

Author

Falk Farkas

Published

August 6, 2025

1 Experimental Setup

Experimental setup:

Comparison of proteome between human exocrine pancreatic tissue left over from an islet preparation and cultured human ductal organoids. Samples were send to the Proteomics core by Christos. 4 technical replicates for the Organoids and 4 for the human prep tissue.

1.1 Saving

Code
path_figures <- paste(path_results, "Analysis","Plots", sep = "/")
########################


########################

save = T

#######################



if (save) {
  dir.create(paste(path_figures,  date, sep = "/"))

  fig_dir = paste(path_figures,  date, sep = "/")
  paste("Saving plots into folder:" , fig_dir)
  figname <- paste("KAC386_Human_organoids")
}
Warning in dir.create(paste(path_figures, date, sep = "/")):
'/Users/falk.farkas/Documents/Proteomics_analysis_Christos_Falk/Pig_Organoids/Analysis/Plots/20250806'
already exists
Code
print(c("Saving plots into folder is:" , save))
[1] "Saving plots into folder is:" "TRUE"                        

2 Load Data

Dataset contains the ECM and Soluble fraction for both, Colon and Jejunum.

Code
filename <- "KAK386_Pig_BSG_PGSpec_Spec20.xlsx"

samples <- read_excel(paste(path_results, filename, sep = "/"), col_names = T , sheet = "Sample sheet")



data <- read_excel(paste(path_results, "20250806_clean_data_KAC386_Pig_organoids.xlsx", sep = "/"), col_names = T )
dim(data)
[1] 8433   18
Code
data %>% head
# A tibble: 6 × 18
  Accession  Genes  Description    Min_Peptides_quantif…¹ Max_Peptides_quantif…²
  <chr>      <chr>  <chr>                           <dbl>                  <dbl>
1 A0A023VTS2 SAMHD1 Deoxynucleosi…                     21                     39
2 A0A076V355 FZD6   Frizzled-6                          5                     10
3 A0A0B8RS50 ZNF654 Zinc finger p…                      5                      6
4 A0A0B8RS56 XRN1   5'-3' exoribo…                     17                     37
5 A0A0B8RS58 WBP11  WW domain bin…                     12                     17
6 A0A0B8RS77 TEC    Tyrosine-prot…                      0                      4
# ℹ abbreviated names: ¹​Min_Peptides_quantified, ²​Max_Peptides_quantified
# ℹ 13 more variables: Min_Peptides_identified <dbl>, CK1_3 <dbl>, CK1_2 <dbl>,
#   CK2 <dbl>, CK3 <dbl>, CK4 <dbl>, CK5 <dbl>, CK6 <dbl>, CK7 <dbl>,
#   CK8 <dbl>, CK9 <dbl>, CK10 <dbl>, CK11 <dbl>
Code
sample_color_pallette <- c(viridis(3,begin = 0.005, end = 0.40), viridis(4,begin = 0.55, end = 0.97))

sample_color <- c( "#99d8c9","#238A8DFF", "#2ca25f", "#deebf7" ,"#a6bddb", "#2b8cbe" ,"#fde0dd" ,"#fa9fb5","#c51b8a","#FDE725FF", "#fec44f","#e34a33" )

names(sample_color) <- samples %>% pull(Sample)
sample_color_vec <- sample_color

3 Start wit DEP2 to calculate the differential abundances

3.1 Remove NA and make gene names unique

Code
paste("How many of the total Accessions in the dataset are unique per segment and fraction?")
[1] "How many of the total Accessions in the dataset are unique per segment and fraction?"
Code
data  %>% summarise(Unique_Accession= n_distinct( (Accession)),
                                                 Accession_count = length(Accession), .groups = "drop") 
# A tibble: 1 × 2
  Unique_Accession Accession_count
             <int>           <int>
1             8433            8433
Code
paste("How many of the total Genes in the dataset are unique per segment and fraction?")
[1] "How many of the total Genes in the dataset are unique per segment and fraction?"
Code
data  %>% summarise(Unique_Genes= n_distinct( (Genes)),
                                                 Gene_count = length(Genes), .groups = "drop") 
# A tibble: 1 × 2
  Unique_Genes Gene_count
         <int>      <int>
1         8433       8433
Code
paste("All Genes and Accessions are Unique!")
[1] "All Genes and Accessions are Unique!"

Data is good to continue with.

3.2 Experimental Design

Code
# generate a experimental design dataframe to establish a SummarizedExperiment
columns <- samples %>% pull(Sample) 


columns
 [1] "CK1_3" "CK1_2" "CK2"   "CK3"   "CK4"   "CK5"   "CK6"   "CK7"   "CK8"  
[10] "CK9"   "CK10"  "CK11" 
Code
exp_design <- samples %>% dplyr::rename(
  label = Sample,
  replicate = Replicate
) %>% select(-Condition)


print("Esperimental Design:")
[1] "Esperimental Design:"
Code
exp_design
# A tibble: 12 × 3
   label replicate condition
   <chr>     <dbl> <chr>    
 1 CK1_3         1 Embryo   
 2 CK1_2         2 Embryo   
 3 CK2           3 Embryo   
 4 CK3           1 PN_earlyP
 5 CK4           2 PN_earlyP
 6 CK5           3 PN_earlyP
 7 CK6           1 PN_lateP 
 8 CK7           2 PN_lateP 
 9 CK8           3 PN_lateP 
10 CK9           1 Adult    
11 CK10          2 Adult    
12 CK11          3 Adult    

3.3 Generate SummarizedExperiment for each split

Code
data_backup <- data
Code
# Step 2: Apply make_unique and make_se to each split
data %>%    make_unique("Genes", "Accession", delim = "_") %>%    make_se(columns = columns, expdesign = exp_design, log2transform = TRUE ) -> data_se

colData(data_se)$label <- factor( colData(data_se)$label,  levels = names(sample_color) )
colData(data_se)
DataFrame with 12 rows and 4 columns
               label            ID replicate   condition
            <factor>   <character> <numeric> <character>
Embryo_1       CK1_3      Embryo_1         1      Embryo
Embryo_2       CK1_2      Embryo_2         2      Embryo
Embryo_3       CK2        Embryo_3         3      Embryo
PN_earlyP_1    CK3   PN_earlyP_...         1   PN_earlyP
PN_earlyP_2    CK4   PN_earlyP_...         2   PN_earlyP
...              ...           ...       ...         ...
PN_lateP_2      CK7     PN_lateP_2         2    PN_lateP
PN_lateP_3      CK8     PN_lateP_3         3    PN_lateP
Adult_1         CK9        Adult_1         1       Adult
Adult_2         CK10       Adult_2         2       Adult
Adult_3         CK11       Adult_3         3       Adult

4 Filtering

4.1 Missing Values pre filtering

Code
  plot_missval(data_se)

4.2 Filtering of missing values

Code
data_filter <- filter_se(data_se, thr = 1, fraction = 2/12)
filter base on missing number is <= 1 in at least one condition.
filter base on missing number fraction < 0.166666666666667 in each row
Code
  plot_missval(data_filter)

4.3 plotting frequencies values before and after filtering

Code
p <- plot_frequency(data_se)+
  ggtitle(paste("Identification overlap before filter"), subtitle = paste("Total Proteins:" , nrow(data_se)))+
            scale_x_discrete(limits = as.character(1:8))


pp <- plot_frequency(data_filter)+ 
  ggtitle(paste("Identification overlap after filter"), subtitle = paste("Total Proteins:" , nrow(data_filter)))+
            scale_x_discrete(limits = as.character(1:8) )
                             
print(p/pp)                          

5 Equalize Variances between samples - Normalization

Code
data_filter %>% normalize_vsn() -> data_norm

p<- plot_normalization(data_filter,data_norm) +
    ggtitle( paste("Normalization of Sample Variance"), subtitle = paste("Total Proteins:" , nrow(data_norm))) 

print(p)

6 Selecting Imputation Method

Code
set.seed(09051997)

6.1 Missing at Random or not at random

Are the missing values missing randomly dispersed through the data, or is missingness more of a feature because the different samples acquire different trajectories and hence different proteomes. Here, it is more MNAR (missing not at random), a lot of proteins are either present in Organoids or in Tissue.

6.2 MNAR vs. MAR

Code
data_norm %>% plot_missval()

6.3 Imputing with different methods

Loading required namespace: imputeLCMD
Imputing along margin 2 (samples/columns).
Iteration 1 start...end!
Iteration 2 start...end!
Iteration 3 start...end!
Iteration 4 start...end!
Iteration 5 start...end!
Iteration 6 start...end!
Iteration 7 start...end!
Iteration 8 start...end!
Iteration 9 start...end!
Iteration 10 start...end!
Iteration 11 start...end!
Iteration 12 start...end!
Iteration 13 start...end!
Iteration 14 start...end!
Iteration 15 start...end!
Imputing along margin 2 (samples/columns).
[1] 0.2282605
Imputing along margin 1 (features/rows).
Cluster size 8252 broken into 3017 5235 
Cluster size 3017 broken into 2044 973 
Cluster size 2044 broken into 859 1185 
Done cluster 859 
Done cluster 1185 
Done cluster 2044 
Done cluster 973 
Done cluster 3017 
Cluster size 5235 broken into 3627 1608 
Cluster size 3627 broken into 1755 1872 
Cluster size 1755 broken into 949 806 
Done cluster 949 
Done cluster 806 
Done cluster 1755 
Cluster size 1872 broken into 907 965 
Done cluster 907 
Done cluster 965 
Done cluster 1872 
Done cluster 3627 
Cluster size 1608 broken into 902 706 
Done cluster 902 
Done cluster 706 
Done cluster 1608 
Done cluster 5235 
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'
The following object is masked from 'package:MSnbase':

    combine
The following object is masked from 'package:Biobase':

    combine
The following object is masked from 'package:BiocGenerics':

    combine
The following object is masked from 'package:dplyr':

    combine
The following object is masked from 'package:ggplot2':

    margin
Loading required package: foreach

Attaching package: 'foreach'
The following objects are masked from 'package:purrr':

    accumulate, when
Loading required package: rngtools
Imputing along margin 2 (samples/columns).
Code
NAs <- is.na(assay(norm_pg_sample)) 
## the imputed values by different methods.
imps <- list("GSimp" = imp_pg_GSimp, "QRILC" = imp_pg_QRILC, "MinProb" = imp_pg_MinProb, "RF" = imp_pg_RF, "knn" = imp_pg_knn) %>% 
  lapply(function(se){
    x = assay(se) %>% data.frame %>% gather("label", "value") %>% 
      left_join(colData(se)[c("label","condition")],copy = T) %>%
      magrittr::extract(as.vector(NAs),)
  }) %>% data.table::rbindlist(idcol = "method")
Joining with `by = join_by(label)`
Joining with `by = join_by(label)`
Joining with `by = join_by(label)`
Joining with `by = join_by(label)`
Joining with `by = join_by(label)`
Code
## the original normalized values without imputation
nonimps <- assay(norm_pg_sample) %>% data.frame %>% gather("label", "value") %>%
  left_join(colData(norm_pg_sample)[c("label","condition")],copy = T) %>% 
  magrittr::extract(!as.vector(NAs),) %>% mutate(method = "non_impute") %>%
  dplyr::select(method,everything())
Joining with `by = join_by(label)`
Code
library(ggridges)
ggplot(rbind(imps, nonimps),aes(x = value,y = factor(method,level = unique(method)))) + 
  geom_density_ridges(fill = "#027ad450", scale = 1.2,
                      jittered_points = TRUE,position = position_points_jitter(height = 0),
                      point_shape = '|', point_size = 2, point_alpha = 1, alpha = 0.7) +
  ylab("Impute method")+ ylab("Log2 value") + xlim(c(9,39))+
  theme_DEP1()+ggtitle("Imputation Effect on Sample Soluble_Jejunum")
Picking joint bandwidth of 0.234

Conclusion:

More Trees in the Random forrest hardly make a difference (RF vs RF2). KNN is imputing too right biased, as expected, since it is more suitable for the MAR values. For Me, GSimp seems to work best, especially since here it is quite clear, that values are MNAR, so a far left shifted imputation is desired.

6.4 Evaluate effect of Imputation on PCA for one example

Code
imp_list <- list("GSimp" = imp_pg_GSimp,
                 "QRILC" = imp_pg_QRILC,
                 "MinProb" = imp_pg_MinProb,
                 "RF" = imp_pg_RF,
                 #"RF2" = imp_pg_RF2,
                 "knn" = imp_pg_knn,
                 "unimputed" = norm_pg_sample
  
)


for (name in names(imp_list)){
  p <- plot_pca(imp_list[[name]], label = F, n = 1500, features = "Proteins", indicate = c("label", "condition"))+
    scale_colour_manual(values = sample_color_vec)+
    theme_cowplot()+
    ggtitle(paste("PCA imputed"), subtitle = paste("Method = ", name))+
    theme(aspect.ratio = 1)
  
  print(p)

  
}

Imputation Using Gsimp seems most appropriate to keep variance similar to Unimputed values.

7 Impute

Deep Learning based imputation using the random gibbs sampler embedded in the GSIMP function.

Code
data_norm %>% DEP2::impute(. , fun = "GSimp", hi_q = 0.1,
                       iters_each=100, iters_all=20) -> data_imp
Imputing along margin 2 (samples/columns).
Iteration 1 start...end!
Iteration 2 start...end!
Iteration 3 start...end!
Iteration 4 start...end!
Iteration 5 start...end!
Iteration 6 start...end!
Iteration 7 start...end!
Iteration 8 start...end!
Iteration 9 start...end!
Iteration 10 start...end!
Iteration 11 start...end!
Iteration 12 start...end!
Iteration 13 start...end!
Iteration 14 start...end!
Iteration 15 start...end!
Iteration 16 start...end!
Iteration 17 start...end!
Iteration 18 start...end!
Iteration 19 start...end!
Iteration 20 start...end!

8 Testing significant differences between conditions

BH correction of pval for multiple testing

Code
data_imp %>%  test_diff(type = "all", control = "Embryo", fdr.type = "BH") %>%
    add_rejections(alpha = 0.05, lfc = 1) -> data_test# alpha is the significance level, lfc is the log fold change threshold
Tested contrasts: PN_earlyP_vs_Embryo, PN_lateP_vs_Embryo, Adult_vs_Embryo, PN_earlyP_vs_PN_lateP, PN_earlyP_vs_Adult, PN_lateP_vs_Adult
BH

###save new dataframe with added significances

Code
tot_data <- tibble()


new_results <- get_results(data_test) %>% as_tibble() %>% mutate(Gene_ID = paste(name, ID, sep = "_"), .before = name)

new_results %>% head
# A tibble: 6 × 32
  Gene_ID               name  ID    Adult_vs_Embryo_p.val PN_earlyP_vs_Adult_p…¹
  <chr>                 <chr> <chr>                 <dbl>                  <dbl>
1 SAMHD1_A0A023VTS2     SAMH… A0A0…                0.250                  0.968 
2 PYCARD_A0A0B8RTB5     PYCA… A0A0…                0.0786                 0.519 
3 MMP19_A0A0B8RTG5      MMP19 A0A0…                0.242                  0.531 
4 SKA3;ZDHHC20_A0A0B8R… SKA3… A0A0…                0.776                  0.459 
5 PLCXD1_A0A0B8RVL3     PLCX… A0A0…                0.0138                 0.0813
6 CPT1A_A0A0B8RVZ5      CPT1A A0A0…                0.140                  0.662 
# ℹ abbreviated name: ¹​PN_earlyP_vs_Adult_p.val
# ℹ 27 more variables: PN_earlyP_vs_Embryo_p.val <dbl>,
#   PN_earlyP_vs_PN_lateP_p.val <dbl>, PN_lateP_vs_Adult_p.val <dbl>,
#   PN_lateP_vs_Embryo_p.val <dbl>, Adult_vs_Embryo_p.adj <dbl>,
#   PN_earlyP_vs_Adult_p.adj <dbl>, PN_earlyP_vs_Embryo_p.adj <dbl>,
#   PN_earlyP_vs_PN_lateP_p.adj <dbl>, PN_lateP_vs_Adult_p.adj <dbl>,
#   PN_lateP_vs_Embryo_p.adj <dbl>, Adult_vs_Embryo_significant <lgl>, …
Code
data_backup %>% as_tibble() %>% mutate(Gene_ID = paste(Genes, Accession, sep = "_"), .before = Accession) %>% dplyr::filter( Genes %in% rownames(assay(data_test))) -> data2


imputed_Data <- assay(data_test)
rename_map <- colData(data_test)$ID
names(rename_map) <- colData(data_test)$label %>% paste0( "_normalized_imputed")

imputed_Data %<>% dplyr::rename(!!!rename_map)

imputed_Data %<>% mutate(Genes = rownames(.))

left_join(new_results, data2, by = "Gene_ID") %>% left_join(. , imputed_Data , by = "Genes") -> tot_data


colSums(is.na(tot_data))
                          Gene_ID                              name 
                                0                                 0 
                               ID             Adult_vs_Embryo_p.val 
                                0                                 0 
         PN_earlyP_vs_Adult_p.val         PN_earlyP_vs_Embryo_p.val 
                                0                                 0 
      PN_earlyP_vs_PN_lateP_p.val           PN_lateP_vs_Adult_p.val 
                                0                                 0 
         PN_lateP_vs_Embryo_p.val             Adult_vs_Embryo_p.adj 
                                0                                 0 
         PN_earlyP_vs_Adult_p.adj         PN_earlyP_vs_Embryo_p.adj 
                                0                                 0 
      PN_earlyP_vs_PN_lateP_p.adj           PN_lateP_vs_Adult_p.adj 
                                0                                 0 
         PN_lateP_vs_Embryo_p.adj       Adult_vs_Embryo_significant 
                                0                                 0 
   PN_earlyP_vs_Adult_significant   PN_earlyP_vs_Embryo_significant 
                                0                                 0 
PN_earlyP_vs_PN_lateP_significant     PN_lateP_vs_Adult_significant 
                                0                                 0 
   PN_lateP_vs_Embryo_significant                       significant 
                                0                                 0 
            Adult_vs_Embryo_ratio          PN_earlyP_vs_Adult_ratio 
                                0                                 0 
        PN_earlyP_vs_Embryo_ratio       PN_earlyP_vs_PN_lateP_ratio 
                                0                                 0 
          PN_lateP_vs_Adult_ratio          PN_lateP_vs_Embryo_ratio 
                                0                                 0 
                   Adult_centered                   Embryo_centered 
                                0                                 0 
               PN_earlyP_centered                 PN_lateP_centered 
                                0                                 0 
                        Accession                             Genes 
                                0                                 0 
                      Description           Min_Peptides_quantified 
                                0                                 0 
          Max_Peptides_quantified           Min_Peptides_identified 
                                0                                 0 
                            CK1_3                             CK1_2 
                              155                               214 
                              CK2                               CK3 
                              160                                94 
                              CK4                               CK5 
                              302                               227 
                              CK6                               CK7 
                              802                               634 
                              CK8                               CK9 
                              835                               254 
                             CK10                              CK11 
                              274                               187 
         CK1_3_normalized_imputed          CK1_2_normalized_imputed 
                                0                                 0 
           CK2_normalized_imputed            CK3_normalized_imputed 
                                0                                 0 
           CK4_normalized_imputed            CK5_normalized_imputed 
                                0                                 0 
           CK6_normalized_imputed            CK7_normalized_imputed 
                                0                                 0 
           CK8_normalized_imputed            CK9_normalized_imputed 
                                0                                 0 
          CK10_normalized_imputed           CK11_normalized_imputed 
                                0                                 0 
Code
library(writexl)

if (save) {
  write_xlsx(x = tot_data, path = paste(path_results, paste(date, "KAK386_Complete_results_with_stats.xlsx", sep = ""), sep = "/"))
}

9 PCA

Code
  p<-plot_pca(data_test, label = F, n = 1500, features = "Proteins", indicate = c("label", "condition"))+
    scale_colour_manual(values = sample_color_vec)+
    theme_cowplot()+
    ggtitle(paste("PCA of top 1500 proteins"), subtitle = paste("Total Proteins:" , nrow(data_test)))+
  theme(aspect.ratio = 1)


  print(p)

Code
if (save) {
    filename <- paste(date, figname, c("PCA_1500.pdf"), sep = "_", collapse = "")
    ggsave(filename, device = NULL , path = fig_dir)
  }
Saving 10 x 5 in image

9.1 PCA top 500

Code
  p<-plot_pca(data_test, label = F, n = 500, features = "Proteins", indicate = c("label", "condition"))+
    scale_colour_manual(values = sample_color_vec)+
    theme_cowplot()+
    ggtitle(paste("PCA of top 500 proteins"), subtitle = paste("Total Proteins:" , nrow(data_test)))+
  theme(aspect.ratio = 1)

  print(p)

Code
  if (save) {
    filename <- paste(date, figname, c("PCA_500.pdf"), sep = "_", collapse = "")
    ggsave(filename,  device = NULL , path = fig_dir)
  }
Saving 10 x 5 in image

9.2 PCA top 150

Code
  p<-plot_pca(data_test, label = F, n = 150, features = "Proteins", indicate = c("label", "condition"))+
    scale_colour_manual(values = sample_color_vec)+
    theme_cowplot()+
    ggtitle(paste("PCA of top 150 proteins"), subtitle = paste("Total Proteins:" , nrow(data_test)))+
  theme(aspect.ratio = 1) 
  
  
    print(p)

Code
  if (save) {
    filename <- paste(date,figname, c("PCA_150.pdf"), sep = "_", collapse = "")
    ggsave(filename,  device = NULL , path = fig_dir)
  }
Saving 10 x 5 in image

9.3 PCA top 50

Code
  p<-plot_pca(data_test, label = F, n = 50, features = "Proteins", indicate = c("label", "condition"))+
    scale_colour_manual(values = sample_color_vec)+
    theme_cowplot()+
    ggtitle(paste("PCA of top 50 proteins"), subtitle = paste("Total Proteins:" , nrow(data_test)))+
  theme(aspect.ratio = 1)


  print(p)

Code
  if (save) {
    filename <- paste(date, figname, c("PCA_50.pdf"), sep = "_", collapse = "")
    ggsave(filename,  device = NULL , path = fig_dir)
  }
Saving 10 x 5 in image
Code
  print(p)

9.4 all on one page

Code
features <- c(50,150,500,1500)
plot_list <- list()
for (feature in features) {
  p <-   p<-plot_pca(data_test, label = F, n = feature, features = "Proteins", indicate = c("label", "condition"))+
    scale_colour_manual(values = sample_color_vec)+
    theme_cowplot(font_size = 6)+
    ggtitle(paste("PCA of top", feature, "proteins"))+
    theme(aspect.ratio = 1)
  p[feature] <- p
}
Warning in p[feature] <- p: number of items to replace is not a multiple of
replacement length
Warning in p[feature] <- p: number of items to replace is not a multiple of
replacement length
Warning in p[feature] <- p: number of items to replace is not a multiple of
replacement length
Warning in p[feature] <- p: number of items to replace is not a multiple of
replacement length
Code
if (save){
    library(gridExtra)  # for grid.arrange
  
  # Arrange 4 plots per page (adjust per your preference)
  n_per_page <- 4
  n_pages <- ceiling(length(plot_list) / n_per_page)
  
  pdf(file = file.path(fig_dir, paste0(date, "_PCA_1500_tiled.pdf")), width = 12, height = 8)
  
  for (i in seq_len(n_pages)) {
    idx_start <- (i - 1) * n_per_page + 1
    idx_end <- min(i * n_per_page, length(plot_list))
    plots_to_print <- plot_list[idx_start:idx_end]
    
    grid.arrange(grobs = plots_to_print, ncol = 2)
  }

  dev.off()
}

Attaching package: 'gridExtra'
The following object is masked from 'package:randomForest':

    combine
The following object is masked from 'package:MSnbase':

    combine
The following object is masked from 'package:Biobase':

    combine
The following object is masked from 'package:BiocGenerics':

    combine
The following object is masked from 'package:dplyr':

    combine
quartz_off_screen 
                2 

10 Correlation

Code
plot_cor(data_test)

Code
  if (save) {
    filename <- paste(date, figname, c("correlation.pdf"), sep = "_", collapse = "")
    ggsave(filename,  device = pdf() , path = fig_dir)
  dev.off()
  }
Saving 7 x 7 in image
quartz_off_screen 
                2